Harmonise CURF with AES 2022

Author
Affiliation

University of Sydney

Published

2:13PM 12 March 2023

Code
import { aq, op } from "@uwdata/arquero"

1 CURF

The CURF is an anonymised selection of microdata from the Australian Census of Population and “Housing”, utilizing a random 1% of person-level records from the Census. At the time of writing data from the 2021 Census is not yet available; we use the 2016 “1% Basic CURF”.

We perform a one-time read the CURF from csv and re-write a local copy as fst for fast-loading:

Code
curf <-
  read_csv(file = here("../data/census/micro-data/BCSF16_person_new.csv"))
write_fst(curf, path = here("data/2022/curf/curf_2016.fst"))

Load fst version of the CURF file:

Code
curf <- read_fst(here("data/2022/curf/curf_2016.fst"))
load(here("data/2022/curf/sa4_codes.RData"))

1.1 Geo-codes and covariates

The CURF geocodes individuals in groupings of SA4s. We build a data set of these groupings and the corresponding SA4s.

Code
sa4_codes <- readxl::read_xls(
  path = here(
    "../data/census/micro-data/data item list - 1 percent basic curf-3.xls"
  ),
  sheet = "Place of Enumeration",
  range = "C19:E160",
  col_names = c("AREAENUM", "description", "SA4_class_code")
) %>%
  janitor::remove_empty(., "rows") %>%
  fill(AREAENUM, description)
save("sa4_codes", file = here("data/2022/curf/sa4_codes.RData"))

1.2 Cross walk from AES geocodes (SA2s and postcodes) to CURF SA4

AES supplies postcodes for all respondents (at least in the restricted access version of the file) and some SA1 information.

Code
library(sf)
aus_geo <- read_sf(
  here(
    "../data/census/geography/ASGS_2021_MAIN_STRUCTURE_GPKG_GDA2020/ASGS_2021_Main_Structure_GDA2020.gpkg"
    )
)

postcodes <- aus_geo %>% 
  st_drop_geometry() %>% 
  select(MB_CODE_2021,
         starts_with("SA4"),
         area = AREA_ALBERS_SQKM) %>% 
  left_join(
    readxl::read_xlsx(
      here("../data/census/geography/POA_2021_AUST.xlsx")
      ),
    by = "MB_CODE_2021") %>% 
  group_by(POA_NAME_2021,SA4_NAME_2021) %>%
  summarise(area = sum(area)) %>%
  ungroup() %>% 
  group_by(POA_NAME_2021) %>%
  mutate(area_per = area/sum(area)*100) %>% 
  slice_max(order_by = area_per) %>% 
  ungroup()
  
sa2 <- aus_geo %>%
  st_drop_geometry() %>% 
  select(SA2_NAME_2021,SA4_NAME_2021) %>%
  distinct() %>% 
  left_join(sa4_codes %>%
              mutate(SA4_NAME_2021 = str_squish(str_remove(SA4_class_code,
                                                           pattern = "^[0-9]{1,}"))) %>% 
              select(AREAENUM,SA4_NAME_2021),
            by = "SA4_NAME_2021")

sa_to_postcodes <- postcodes %>% 
  left_join(sa2 %>% 
              distinct(SA4_NAME_2021,AREAENUM),
            by = c("SA4_NAME_2021"))

save("sa2","sa_to_postcodes",
     file = here("data/2022/SA2/sa2.RData"))

Join SA2 to SA4s.

Code
load(here("data/2022/curf/sa4_codes.RData"))
load(here("data/2022/SA2/sa2.RData"))
sa2 <- sa2 %>% 
  left_join(sa4_codes %>% 
              mutate(SA4_NAME_2021 = str_squish(str_remove(SA4_class_code,
                                                           pattern = "^[0-9]{1,}"))),
            by = "SA4_NAME_2021"
            )

1.3 CURF recodes

Variable labels and recodes, drawing on XLS file accompanying CURF:

Code
curf_meta <- 
  tribble(
    ~"var", ~"description",
  "ABSFID", "Family Identifier",
  "ABSHID", "Household Identifier",
  "ABSPID", "Person Identifier",
  "AGEP",     "Age of persons",
  "ANC1P",  "Ancestry 1st Response",
  "ANC2P",  "Ancestry 2nd Response",
  "AREAENUM",   "Area of Enumeration",
  "ASSNP",  "Core Activity Need for Assistance",
  "BPFP",   "Birthplace of Female Parent",
  "BPLP",   "Country of Birth of Person",
  "BPMP",   "Birthplace of Male Parent",
  "CHCAREP",    "Unpaid Child Care",
  "CITP",   "Australian Citizenship",
  "CTPP",   "Child Type",
  "DOMP",   "Unpaid Domestic Work: Number of Hours",
  "DWIP",   "Dwelling Indicator for Persons",
  "EMPP",   "Number of Employees",
  "ENGP",   "Proficiency in Spoken English",
  "FTPP",   "Form Type",
  "GNGP",   "Public/Private Employer Indicator",
  "HRSP",   "Hours Worked",
  "HSCP",   "Highest Year of School Completed",
  "INCP",   "Total Personal Income (weekly)",
  "INDP",   "Industry of Employment",
  "INGP",   "Indigenous Status",
  "LANP",   "Language Spoken at Home",
  "LFSP",   "Labour Force Status",
  "MSTP",   "Registered Marital Status",
  "MTWP",   "Method of Travel to Work",
  "OCCP",   "Occupation",
  "QALFP", "Non-School Qualification: Field of Study",
  "QALLP", "Non-School Qualification: Level of Education",
  "REGU1P", "Region of Usual Residence One Year Ago",
  "REGU5P", "Region of Usual Residence Five Years Ago",
  "REGUCP", "Region of Usual Residence on Census Night",
  "RELP",   "Religious Affiliation",
  "RLHP",   "Relationship in Household",
  "RPIP",   "Family/Household Reference Person Indicator",
  "SEXP",   "Sex",
  "SIEMP", "Status in employment",
  "STATEUR", "State of Usual Residence",
  "STUP",   "Full/Part Time Student Status",
  "TISP",   "Number of Children Ever Born",
  "TYPP",   "Type of Educational Institution Attending",
  "UAI1P",  "Usual Address One Year Ago Indicator",
  "UAI5P",  "Usual Address Five Years Ago Indicator",
  "UAICP",  "Usual Address Indicator Census Night",
  "UNCAREP", "Unpaid Assistance to a Person with a Disability",
  "VOLWP",  "Voluntary Work for an Organisation or Group",
  "YARP",   "Year of Arrival in Australia"
)

## recodes
curf <- curf %>% 
  
  select(-ABSFID,-ABSHID,-ABSPID) %>% 
  
  mutate(
    
    AGEP = case_match(
      AGEP,
      0:24  ~ as.character(AGEP),
      25 ~ "25-29",
      26 ~ "30-34",
      27 ~ "35-39",
      28 ~ "40-44",
      29 ~ "45-49",
      30 ~ "50-54",
      31 ~ "55-59",
      32 ~ "60-64",
      33 ~ "65-69",
      34 ~ "70-74",
      35 ~ "75-79",
      36 ~ "80-84",
      37 ~ "85>"
    ),
    
    AGEP = factor(
      AGEP,
      levels = c(
        as.character(
          0:24),
          "25-29",
          "30-34",
          "35-39",
          "40-44",
          "45-49",
          "50-54",
          "55-59",
          "60-64",
          "65-69",
          "70-74",
          "75-79",
          "80-84",
          "85>"),
      ordered = TRUE),
    
    ANC1P = case_match(
      ANC1P,
      1 ~ "Australian",
      2 ~ "Australian Aboriginal",
      3 ~ "Maori",
      4 ~ "New Zealander",
      5 ~ "Other Oceanian",
      6 ~ "English",
      7 ~ "Scottish",
      8 ~ "Irish",
      9 ~ "Dutch",
      10 ~ "German",
      11    ~ "Other North-West European",
      12    ~ "Italian",
      13    ~ "Maltese",
      14    ~ "Spanish",
      15    ~ "Croatian",
      16    ~ "Greek",
      17    ~ "Macedonian",
      18    ~ "Serbian",
      19    ~ "Polish",
      20    ~ "Russian",
      21    ~ "Other Southern and Eastern European",
      22    ~ "Lebanese",
      23    ~ "Iranian",
      24    ~ "Turkish",
      25    ~ "Other North African and Middle Eastern",
      26    ~ "Thai",
      27    ~ "Vietnamese",
      28    ~ "Filipino",
      29    ~ "Other South-East Asian",
      30    ~ "Chinese",
      31    ~ "Korean",
      32    ~ "Other North-East Asian",
      33    ~ "Indian",
      34    ~ "Nepalese",
      35    ~ "Pakistani",
      36    ~ "Sri Lankan",
      37    ~ "Other Southern and Central Asian",
      38    ~ "People of the Americas",
      39    ~ "South African",
      40    ~ "Other Sub-Saharan African",
      41    ~ "Inadequately described, Not stated, Other supplementary codes",
      42    ~ "Overseas visitor"
    ),
    
    ANCP_collapse = case_when(
      ANC1P == "English"  ~ "English",
      ANC1P == "Irish" ~ "Irish",
      ANC1P == "Scottish" ~ "Scottish",
      ANC1P == "Italian" ~ "Italian",
      ANC1P == "German" ~ "German",
      ANC1P == "Chinese" ~ "Chinese",
      grepl("^Australian",ANC1P) ~ "Australian",
      ANC1P == "Inadequately described, Not stated, Other supplementary codes" ~ NA,
      TRUE ~ "Other"
    ),
    
    ANCP_collapse = factor(ANCP_collapse),
    
    ANC2P = case_match(
      ANC2P,
      1 ~ "Australian",
      2 ~ "Australian Aboriginal",
      3 ~ "Maori",
      4 ~ "New Zealander",
      5 ~ "Other Oceanian",
      6 ~ "English",
      7 ~ "Scottish",
      8 ~ "Irish",
      9 ~ "Dutch",
      10 ~ "German",
      11    ~ "Other North-West European",
      12    ~ "Italian",
      13    ~ "Maltese",
      14    ~ "Spanish",
      15    ~ "Croatian",
      16    ~ "Greek",
      17    ~ "Macedonian",
      18    ~ "Serbian",
      19    ~ "Polish",
      20    ~ "Russian",
      21    ~ "Other Southern and Eastern European",
      22    ~ "Lebanese",
      23    ~ "Iranian",
      24    ~ "Turkish",
      25    ~ "Other North African and Middle Eastern",
      26    ~ "Thai",
      27    ~ "Vietnamese",
      28    ~ "Filipino",
      29    ~ "Other South-East Asian",
      30    ~ "Chinese",
      31    ~ "Korean",
      32    ~ "Other North-East Asian",
      33    ~ "Indian",
      34    ~ "Nepalese",
      35    ~ "Pakistani",
      36    ~ "Sri Lankan",
      37    ~ "Other Southern and Central Asian",
      38    ~ "People of the Americas",
      39    ~ "South African",
      40    ~ "Other Sub-Saharan African",
      41    ~ "Inadequately described, Not stated, Other supplementary codes",
      42    ~ "Overseas visitor",
      43 ~ "Not applicable"
    ),
    
    BPLP = case_match(
      BPLP,
      1 ~ "Australia",
      2 ~ "New Zealand",
      3 ~ "United Kingdom, Channel Islands & Isle of Man",
      4 ~ "Germany",
      5 ~ "Italy",
      6 ~ "Greece",
      7 ~ "Vietnam",
      8 ~ "Philippines",
      9 ~ "China (excl. SARs and Taiwan)",
      10 ~ "India",
      11 ~ "Oceania and Antarctica (excl. Australia, New Zealand)",
      12    ~ "North-West Europe (excl. United Kingdom, Channel Islands, Isle of Man, Germany)",
      13    ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      14    ~ "North Africa and the Middle East",
      15    ~  "South-East Asia (excl. Vietnam, Philippines)",
      16    ~ "North-East Asia (excl. China)",
      17    ~ "Southern and Central Asia (excl. India)",
      18    ~ "Americas",
      19    ~ "Sub-Saharan Africa",
      20    ~ NA,
      21    ~ "Overseas visitor"
    ),
    
    born_country_three = case_when(
      BPLP == "Australia" ~ "Australia",
      BPLP %in% c("New Zealand",
                  "United Kingdom, Channel Islands & Isle of Man") ~ "UK/NZ",
      BPLP == "Inadequately described, At sea, Not stated" ~ NA,
      is.na(BPLP) ~ NA,
      TRUE ~ "Other"
    ),
    
    born_country_three = factor(born_country_three),
    
    CITP = case_match(
      CITP,
      1 ~ "Australian",
      2 ~ "Not Australian",
      3 ~ NA,
      4 ~ "Not Australian"
    ),
    
    HSCP = case_match(
      HSCP,
      1 ~ "Year 12 or equivalent",
      2 ~ "Year 11 or equivalent",
      3 ~ "Year 10 or below (includes Did not go to school)",
      4 ~ NA,
      5 ~ "Not applicable",
      6 ~   "Overseas visitor"
    ),
    
    INCP_original = case_match(
      INCP,
      1 ~ "Negative income", 
      2 ~ "Nil income", 
      3 ~ "$1-$7,799", 
      4 ~ "$7,800-$15,599",
      5 ~ "$15,600-$20,799", 
      6 ~ "$20,800-$25,999",
      7 ~ "$26,000-$33,799", 
      8 ~ "$33,800-$41,599", 
      9 ~ "$41,600-$51,999",
      10 ~ "$52,000-$64,999",
      11 ~ "$65,000-$77,999",
      12 ~ "$78,000-$90,999",
      13 ~ "$91,000-$103,999",
      14 ~ "$104,000-$155,999",
      15 ~ "$156,000>",
      16 ~ "Not stated",
      17 ~ "Not applicable",
      18 ~ "Overseas visitor" 
    ),
    
    INCP = case_match(
      INCP,
      c(1,2,3,4) ~ "Less than $15,000 per year",
      c(4,5,6) ~ "$15,001 to $25,000 per year",
      c(7) ~ "$25,001 to $35,000 per year",
      8 ~ "$35,001 to $45,000 per year",
      c(9,10) ~ "$45,001 to $60,000 per year",
      11 ~ "$60,001 to $80,000 per year",
      c(12,13) ~ "$80,001 to $100,000 per year",
      14 ~ "$104,000-$155,999",
      15 ~ "$156,000>",
      16 ~ NA,
      17 ~ "Not applicable",
      18 ~ "Overseas visitor" 
    ),
    
    INCP = factor(
      INCP,
      levels = c(
        "Less than $15,000 per year",
        "$15,001 to $25,000 per year",
        "$25,001 to $35,000 per year",
        "$35,001 to $45,000 per year",
        "$45,001 to $60,000 per year",
        "$60,001 to $80,000 per year",
        "$80,001 to $100,000 per year",
        "$104,000-$155,999",
        "$156,000>",
        "Not applicable",
        "Overseas visitor"
      ),
      ordered = TRUE
    ), 
    
    INGP = case_match(
      INGP,
      1 ~ "Non-Indigenous",
      2 ~ "Aboriginal, Torres Strait Islander, or both",
      3 ~ "Not stated",
      4 ~ "Overseas visitor"
      ),
    
    INGP_binary = INGP == "Aboriginal, Torres Strait Islander, or both",
    
    LFSP = case_match(
      LFSP,
      1 ~ "Employed",
      2 ~ "Unemployed",
      3 ~ "Not in the labour force",
      4 ~ "Not stated",
      5 ~ "Not applicable",
      6 ~ "Overseas visitor"
    ),
    
    MSTP = case_match(
      MSTP,
      1 ~ "Never married",
      2 ~   "Widowed",
      c(3,4)    ~ "Divorced or separated",
      5 ~ "Married",
      6 ~ "Not applicable"
    ),
    
    MSTP = factor(MSTP),
    
    OCCP = case_match(
      OCCP,
      1 ~ "Managers",
      2 ~   "Professionals",
      3 ~ "Technicians and Trades Workers",
      4 ~ "Community and Personal Service Workers",
      5 ~ "Clerical and Administrative Workers",
      6 ~ "Sales Workers",
      7 ~ "Machinery Operators and Drivers",
      8 ~ "Labourers",
      9 ~ NA,
      10 ~ NA,
      11 ~ "Not applicable",
      12 ~ "Overseas visitor"
    ),
    
    QALLP = case_match(
      QALLP,
      1 ~ "Postgraduate Degree Level",
      2 ~ "Graduate Diploma and Graduate Certificate Level",
      3 ~ "Bachelor Degree Level",
      4 ~ "Advanced Diploma and Diploma Level",
      5 ~ "Certificate Level",
      6 ~ "Level of education inadequately described",
      7 ~ "Level of education not stated",
      8 ~ "Not applicable",
      9 ~ "Overseas visitor"
    ),
    
    UNI = case_when(
      QALLP %in% c("Postgraduate Degree Level",
                   "Graduate Diploma and Graduate Certificate Level",
                   "Bachelor Degree Level") ~ "Yes",
      TRUE ~ "No"
    ),
    
    RELP = case_match(
      RELP,
      1 ~ "Buddhism",
      2 ~ "Anglican",
      3 ~ "Baptist",
      4 ~ "Catholic",
      5 ~ "Greek Orthodox",
      6 ~ "Presbyterian",
      7 ~ "Uniting Church",
      8 ~ "Other Christian",
      9 ~ "Hinduism",
      10    ~ "Islam",
      11    ~ "Judaism and Other Religions",
      12    ~ "No Religion (so described)",
      13    ~ "Secular Beliefs and Other Spiritual Beliefs",
      14    ~ NA,
      15    ~ "Overseas visitor"
    ),
    
    RELP_collapsed = case_when(
      RELP %in% c("Anglican",
                  "Baptist",
                  "Uniting Church") ~ "Protestant",
      
      RELP %in% c("No Religion (so described)",
                  "Secular Beliefs and Other Spiritual Beliefs") ~ "No Religion (so described)",
                  
      TRUE ~ RELP  
    ),
    
    RELP_collapsed = factor(RELP_collapsed),
    
    SEXP = case_match(
      SEXP,
      1 ~ "Male",
      2 ~ "Female"
    ),
    
    STATEUR = case_match(
      STATEUR,
      1 ~   "NSW",
      2 ~ "VIC",
      3 ~ "QLD",
      4 ~ "SA",
      5 ~ "WA",
      6 ~ "TAS",
      7 ~ "NT",
      8 ~ "ACT",
      9 ~ "2016 Overseas visitor",
      10 ~ "OT, No Usual Address and Migratory/Offshore/Shipping areas"
    )
    
    ) %>% 
  mutate(
    across(
      where(is.double) | where(is.logical),
      ~as.character(.)
      )
    )

areaenum_info <- sa4_codes %>% distinct(AREAENUM,.keep_all = TRUE) %>% select(AREAENUM,description)

curf <- curf %>%
  mutate(
    AREAENUM = factor(
      AREAENUM,
      levels = areaenum_info %>% pull(AREAENUM),
      labels = areaenum_info %>% pull(description),
      ordered = TRUE
      )
    )

1.4 Marginal distributions of variables likely appearing the AES:

Code
curf_marginals <- curf %>% 
  pivot_longer(cols = everything(),
               names_to = "var",
               values_to = "value") %>% 
  count(var,value) %>% 
  group_by(var) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup()
Code
curf_marginals_ojs = transpose(curf_ojs_raw)
curf_marginals_aq = aq.from(curf_marginals_ojs)
curf_meta_aq = aq.from(transpose(curf_meta_raw))

viewof selection = Inputs.select(
  curf_meta_aq,
  {
    value: d => d.var,
    format: d => d.var + ": " + d.description,
    label: "Select variable"
    }
  )

tab = curf_marginals_aq
  .join_left(curf_meta_aq)
  .filter(aq.escape(d => d.var == selection.var))
  .select("value","n","p")
  
tab_shown = Inputs.table(tab,
  {
    header: {
      p: "%"
    },
    format: {
      p: x => d3.format(".1%")(x)
    },
    sort: "p",
    reverse: "true",
    rows: tab.numRows(),
    maxHeight: (tab.numRows()+1.5)*28
  }
)

2 SA4 covariates

The AES is sparse with respect to both SA4s and the agglomerations of SA4s used to geo-code individual records in the CURF. ABS provides data on SA4s from the 2021 Census which we download and combine into a data set of covariates elsewhere, in code/sa4_features.R.

Here we aggregate the SA4 level covariates up to the agglomerations in the CURF denoted with AREAENUM.

Code
load(here("data/2022/sa4/sa4_prcomp.RData"))
load(here("data/2022/sa4/sa4.RData"))

areaenum_covars <- left_join(
  sa4_prcomp,
  sa4_codes %>%
    mutate(SA4_CODE_2021 = as.numeric(str_extract(SA4_class_code,pattern = "^[0-9]{3}"))) %>%
    select(AREAENUM,SA4_CODE_2021),
  by = "SA4_CODE_2021"
  ) %>%
  left_join(
    sa4_features %>%
      select(SA4_CODE_2021,
             n = Australian_citizen_P_G01),
    by = "SA4_CODE_2021"
    ) %>%
  group_by(AREAENUM) %>%
  mutate(w = n/sum(n)) %>% 
  ungroup() %>% 
  select(AREAENUM,SA4_CODE_2021,n,w,everything()) %>% 
  group_by(AREAENUM) %>% 
  summarise(
    across(
       .cols = starts_with("PC") | ends_with("_G02"),
       .fns = ~weighted.mean(x=.x,w=n)
     )
  ) %>% 
  ungroup()

3 AES

We read the 2022 AES. These data are supplied in stata format with accompanying attributes such as var.labels. We recode reported House of Representatives vote to hvote_collapsed.

Code
d2022_attr <- attributes(
  readstata13::read.dta13(
    here("data/2022/aes/2790 AES 2022 Datafile - Weighted Final 2022-10-28 copy.dta"),
    nonint.factors = TRUE,
    generate.factors = TRUE
    )
  )

levs <- d2022_attr$label.table
levs_long <- lapply(levs,
                    function(x){
                      tibble(
                        value = as.character(x),
                        label = names(x)
                        )
                      }
                    ) %>% 
  bind_rows(.id = "var")

 
d2022_meta <- tibble(var = d2022_attr$names,
                     description = d2022_attr$var.labels)

d2022 <- readstata13::read.dta13(
  here(
    "data/2022/aes/2790 AES 2022 Datafile - Weighted Final 2022-10-28 copy.dta"
  ),
  nonint.factors = TRUE,
  convert.factors = TRUE,
  generate.factors = TRUE
) %>%
  as_tibble()

3.1 Occupation codes

We pick up ANZSCO codes from the public release of the data:

Code
d2022_public <- read_csv(
  here("data/2022/aes/aes22_unrestricted_v2.csv")
) %>% select(Respondent_ID = ID,G5_Code)
Rows: 2508 Columns: 368
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (33): ID, A12_Order, A13_Order, B1_other, B9_1_other, B9_2_other, C2_O...
dbl  (334): ID_2019, ID_2016, Mode, STATE, sampsrc, accesstype, PANEL_FLAG, ...
date   (1): IntDate

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
d2022 <- d2022 %>%
  left_join(d2022_public,
            by = "Respondent_ID") %>%
  rename(G5Code_raw = G5_Code.x,
         G5Code = G5_Code.y) %>%
  mutate(G5Code = factor(
    G5Code,
    levels = c(1:8, 9970, 9990, 99999),
    labels = c(
      "Managers",
      "Professionals",
      "Technicians and Trades Workers",
      "Community and Personal Service Workers",
      "Sales Workers",
      "Machinery Operators and Drivers",
      "Labourers",
      "Inadequately described",
      "Not applicable",
      "Not applicable",
      "Item skipped"
      )
    )
  )

3.2 Vote recode

Code
d2022 <- d2022 %>% 
  mutate(hvote = B9_1) %>%
  mutate(
    hvote_collapsed = case_when(
      hvote == "Liberal" ~ "Coalition",
      hvote == "Liberal National Party of Queensland" ~ "Coalition",
      hvote == "Country Liberals (NT)" ~ "Coalition",
      hvote == "National Party" ~ "Coalition",
      hvote == "Labor" ~ "Labor",
      hvote == "Labor Party (ALP)" ~ "Labor",
      hvote == "Greens" ~ "Greens",
      hvote %in% c("No party","Independent") ~ "Independent",
      hvote == "Not eligible to vote" ~ "Not eligible to vote",
      hvote == "Voted informal / Did not vote" ~ "Not vote or informal",
      hvote == "Did not vote" ~ "Not vote or informal",
      hvote %in% c(
        "Refused",
        "Don't know",
        "No answer",
        "Item skipped",
        "Does not apply"
      ) ~ NA_character_,
      ##hvote == "No party" ~ NA_character_,
      is.na(hvote) ~ NA_character_,
      TRUE ~ "Other"
    ),
    
    hvote_collapsed = factor(
      hvote_collapsed
    )
  )

kable(
  xtabs(
    ~ hvote + hvote_collapsed,
    subset = NULL,
    exclude = NULL,
    data = d2022
    )
) %>% 
  kable_styling(full_width = TRUE)
Coalition Greens Independent Labor Other
Liberal 800 0 0 0 0
Labor 0 0 0 917 0
National Party 75 0 0 0 0
Greens 0 348 0 0 0
Other party (please specify) 0 0 0 0 6
No party 0 0 61 0 0
Australian Democrats 0 0 0 0 0
Christian Democratic Party 0 0 0 0 0
Citizens Electoral Council 0 0 0 0 0
Family First Party 0 0 0 0 2
Pauline Hanson's One Nation 0 0 0 0 30
Republican Party (replaced by Republican Party of Australia) 0 0 0 0 1
Shooters, Fishers and Farmers Party 0 0 0 0 2
Fishing Party 0 0 0 0 1
United Australia Party (formerly Palmer's United Party) 0 0 0 0 20
Katter's Australia Party 0 0 0 0 1
Liberal Democrats 0 0 0 0 11
Motoring Enthusiasts Party 0 0 0 0 0
Australian Sports Party (dissolved in 2015) 0 0 0 0 0
Reason Party (formerly The Australian Sex Party) 0 0 0 0 1
The Wikileaks Party (dissolved in 2015) 0 0 0 0 0
Australian Christians 0 0 0 0 2
Derryn Hinch's Justice Party 0 0 0 0 1
Centre Alliance (formerly Nick Xenophon Team) 0 0 0 0 2
Rise Up Australia 0 0 0 0 0
Science Party 0 0 0 0 0
Australian Liberty Alliance 0 0 0 0 0
Pirate Party 0 0 0 0 0
Jacquie Lambie Network 0 0 0 0 0
Arts Party 0 0 0 0 0
Animal Justice Party 0 0 0 0 4
Australian Cyclists Party 0 0 0 0 0
Health Australia Party 0 0 0 0 0
Affordable Housing Party 0 0 0 0 0
Australia First Party 0 0 0 0 1
Australian Better Families 0 0 0 0 0
Australian Conservatives 0 0 0 0 0
Australian People's Party 0 0 0 0 0
Australian Progressives 0 0 0 0 0
Australian Workers Party 0 0 0 0 0
Child Protection Party 0 0 0 0 0
Climate Action! Immigration Action! Accountable Politicians! 0 0 0 0 0
Country Liberals (NT) 0 0 0 0 0
Democratic Labour Party 0 0 0 0 1
Fraser Anning'S Conservative National Party 0 0 0 0 0
Help End Marijuana Prohibition (HEMP) Party 0 0 0 0 0
Independents For Climate Action Now 0 0 0 0 0
Involuntary Medication Objectors (Vaccination/Fluoride) Party 0 0 0 0 0
Labour DLP 0 0 0 0 0
Liberal National Party of Queensland 0 0 0 0 0
Love Australia or Leave 0 0 0 0 0
Non-Custodial Parents Party (Equal Parenting) 0 0 0 0 0
Secular Party of Australia 0 0 0 0 0
Seniors United Party of Australia 0 0 0 0 0
Socialist Alliance 0 0 0 0 1
Socialist Equality Party 0 0 0 0 0
Sustainable Australia 0 0 0 0 0
The Australian Mental Health Party 0 0 0 0 0
The Great Australian Party 0 0 0 0 3
The Small Business Party 0 0 0 0 0
The Together Party 0 0 0 0 0
Victorian Socialists 0 0 0 0 0
VOTEFLUX.ORG | Upgrade Democracy! 0 0 0 0 0
WESTERN AUSTRALIA PARTY 0 0 0 0 1
Yellow Vest Australia 0 0 0 0 0
Australian Citizens Party 0 0 0 0 0
Australian Federation Party 0 0 0 0 2
Australian Values Party 0 0 0 0 0
David Pocock 0 0 0 0 1
Drew Pavlou Democratic Alliance 0 0 0 0 0
FUSION: Science, Pirate, Secular, Climate Emergency 0 0 0 0 2
Federal ICAC Now 0 0 0 0 0
Indigenous - Aboriginal Party of Australia 0 0 0 0 1
Informed Medical Options Party 0 0 0 0 1
Rex Patrick Team 0 0 0 0 0
TNL 0 0 0 0 0
The Local Party of Australia 0 0 0 0 0
Swing Voter 0 0 0 0 0
Independent 0 0 129 0 0
Other party (not specified) 0 0 0 0 7
Does not apply 0 0 0 0 0
Item skipped 0 0 0 0 0

3.3 Collapse religion

Code
d2022 <- d2022 %>%
  mutate(
    RELP_collapsed = case_when(
      grepl("Catholic", H6) ~ "Catholic",
      H6 == "Baptist" ~ "Protestant",
      H6 %in% c(
        "Churches of Christ",
        "Latter Day Saints",
        "Lutheran",
        "Salvation Army",
        "Pentecostalism",
        "Christian (Specified)",
        "Seventh Day Adventist",
        "Other Protestant",
        "Other Christian"
      ) ~ "Other Christian",
      H6 == "Buddhist" ~ "Buddhism",
      H6 == "Anglican / Church of England" ~ "Protestant",
      H6 == "Hebrew/Jewish" ~ "Judaism and Other Religions",
      H6 == "Muslim" ~ "Islam",
      H6 == "Other Non-Christian" ~ "Judaism and Other Religions",
      H6 == "Hindu" ~ "Hinduism",
      H6 == "Uniting Church / Methodist" ~ "Protestant",
      H6 == "Orthodox Church" ~ "Greek Orthodox",
      H6 == "Presbyterian" ~ "Presbyterian",
      H6 == "Other (please specify)" ~ "Judaism and Other Religions",
      H6 == "No religion" ~ "No Religion (so described)",
      H6 == "Item skipped" ~ NA,
      is.na(H6) ~ NA
    ),
    
    RELP_collapsed = factor(RELP_collapsed)
  )

3.4 Miscellaneous others, recoded to match CURF

Code
d2022 <- d2022 %>%
  mutate(
    
    # sex
    SEXP = dem_gender,
    
    ## Indigenous
    INGP_binary = as.character(H4 == "Yes"),
    
    ## EDUCATION
    G3 = factor(
      G3,
      levels = c(
        "No qualification since leaving school",
        "Trade qualification",
        "Non-trade qualification",
        "Associate Diploma",
        "Undergraduate Diploma",
        "Bachelor Degree (including Honours)",
        "Postgraduate Degree or Postgraduate Diploma"
      ),
      ordered = TRUE
    ),
    
    ## INCOME
    INCP = case_when(
      J6 %in% levels(J6)[1:7] ~ levels(curf$INCP)[as.numeric(J6)],
      J6 %in% levels(J6)[8:10] ~ "$104,000-$155,999",
      J6 %in% levels(J6)[11:13] ~ "$156,000>",
      J6 == "Item skipped" ~ NA
      ),
    
     INCP = factor(
      INCP,
      levels = levels(curf$INCP),
      labels = levels(curf$INCP),
      ordered = TRUE
    ), 
    
    J6 = factor(
      J6,
      levels = setdiff(names(levs$J6), "Item skipped"),
      ordered = TRUE
    ),
    J6_continuous = as.numeric(J6),
    
    BPLP = case_when(
      H3_1 == "Australia" ~ "Australia",
      H3_1 == "Australia (includes External Territories)" ~ "Australia",
      H3_1 %in% c("1200", "New Zealand") ~ "New Zealand",
      H3_1 == "2200" ~ "North-West Europe (excl. United Kingdom, Channel Islands, Isle of Man, Germany)",
      H3_1 == "United Kingdom, Channel Islands and Isle of Man" ~ "United Kingdom, Channel Islands & Isle of Man",
      H3_1 == "Item skipped" ~ NA,
      H3_1 == "India" ~ "India",
      H3_1 == "South Africa" ~ "Sub-Saharan Africa",
      H3_1 == "Philippines" ~ "Philippines",
      H3_1 == "China (excludes SARs and Taiwan)" ~ "China (excl. SARs and Taiwan)",
      H3_1 == "Vietnam" ~ "Vietnam",
      H3_1 == "Germany" ~ "Germany",
      H3_1 == "Netherlands" ~ "North-West Europe (excl. United Kingdom, Channel Islands, Isle of Man, Germany)",
      H3_1 == "Malaysia" ~ "South-East Asia (excl. Vietnam, Philippines)",
      H3_1 == "Italy" ~ "Italy",
      
      H3_1 %in% c("Sri Lanka", "Pakistan", "Nepal", "Afghanistan", "Tajikistan") ~
        "Southern and Central Asia (excl. India)",
      
      H3_1 == "Canada" ~ "Americas",
      H3_1 == "Hong Kong (SAR of China)" ~ "North-East Asia (excl. China)",
      H3_1 == "Ireland" ~ "North-West Europe (excl. United Kingdom, Channel Islands, Isle of Man, Germany)",
      H3_1 == "Greece" ~ "Greece",
      H3_1 == "Singapore" ~ "South-East Asia (excl. Vietnam, Philippines)",
      H3_1 == "Malta" ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      H3_1 == "Indonesia" ~ "South-East Asia (excl. Vietnam, Philippines)",
      H3_1 == "Taiwan" ~ "North-East Asia (excl. China)",
      H3_1 == "Fiji" ~ "Oceania and Antarctica (excl. Australia, New Zealand)",
      H3_1 == "Croatia" ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      H3_1 == "Hungary" ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      H3_1 == "Mauritius" ~ "Sub-Saharan Africa",
      
      H3_1 %in% c(
        "Ghana",
        "Lesotho",
        "Kenya",
        "Nigeria",
        "Tanzania",
        "Uganda",
        "Zimbabwe",
        "Zambia"
      ) ~ "Sub-Saharan Africa",
      
      H3_1 == "Poland" ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      
      H3_1 %in% c("Egypt", "Israel", "Lebanon", "Jordan", "Kuwait", "Syria") ~ "North Africa and the Middle East",
      
      H3_1 %in% c("Japan", "Japan and the Koreas", "Mongolia") ~ "North-East Asia (excl. China)",
      
      H3_1 == "Korea, Republic of (South)" ~ "North-East Asia (excl. China)",
      H3_1 == "Bangladesh" ~ "Southern and Central Asia (excl. India)",
      
      H3_1 %in% c(
        "United States of America",
        "Brazil",
        "Chile",
        "Mexico",
        "Peru",
        "Argentina",
        "Columbia",
        "Paraguay",
        "Venezuela, Bolivarian Republic of",
        "Nicaragua",
        "Bolivia, Plurinational State of",
        "Uruguay"
      ) ~ "Americas",
      
      H3_1 == "Inadequately described" ~ "Inadequately described, At sea, Not stated",
      H3_1 == "Papua New Guinea" ~ "Oceania and Antarctica (excl. Australia, New Zealand)",
      
      H3_1 %in% c("England", "Northern Ireland", "Wales", "Scotland", "Guernsey") ~
        "United Kingdom, Channel Islands & Isle of Man",
      
      H3_1 %in% c("Austria", "Switzerland") ~ "North-West Europe (excl. United Kingdom, Channel Islands, Isle of Man, Germany)",
      
      H3_1 == "Serbia" ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      H3_1 == "Ukraine" ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      H3_1 == "Iran" ~ "North Africa and the Middle East",
      H3_1 == "Turkey" ~ "North Africa and the Middle East",
      
      H3_1 %in% c("Laos", "Thailand") ~ "South-East Asia (excl. Vietnam, Philippines)",
      
      H3_1 %in% c("Former USSR, nfd", "Former Yugoslavia, nfd") ~ "Southern and Eastern Europe (excl. Italy, Greece)",
      
      H3_1 %in% c(
        "Spain",
        "Portugal",
        "Cyprus",
        "Romania",
        "Estonia",
        "Russian Federation",
        "Slovakia",
        "Former Yugoslav Republic of Macedonia (FYROM)"
      ) ~
        "Southern and Eastern Europe (excl. Italy, Greece)",
      
      H3_1 == "Sudan" ~ "North Africa and the Middle East",
      H3_1 == "Myanmar, The Republic of the Union of" ~ "South-East Asia (excl. Vietnam, Philippines)"
    ),
    
    
    born_country_three = case_when(
      BPLP == "Australia" ~ "Australia",
      BPLP %in% c(
        "United Kingdom, Channel Islands & Isle of Man",
        "New Zealand") ~ "UK/NZ",
      is.na(H3_1) | H3_1 == "Item skipped" | BPLP == "Inadequately described, At sea, Not stated" ~ NA,
      TRUE ~ "Other"
    ),
    
    born_country_three = factor(born_country_three),
    
    ANCP_collapse = if_else(H4a == "Other (please specify)","Other",H4a),
    ANCP_collapse = if_else(H4a == "Item skipped",NA,ANCP_collapse),
    ANCP_collapse = factor(ANCP_collapse),
    
    MSTP = case_match(
      H8,
      "Never married" ~ "Never married",
      "Now married (including de facto relationships)" ~ "Married",
      "Divorced or separated" ~ "Divorced or separated",
      "Widowed" ~ "Widowed",
      ),
    
    MSTP = factor(MSTP),
    
    STATEUR = case_match(
      dem_state,
      "New South Wales" ~ "NSW",
      "Victoria" ~ "VIC",
      "Queensland" ~ "QLD",
      "South Australia" ~ "SA",
      "Western Australia" ~ "WA",
      "Tasmania" ~ "TAS",
      "Northern Territory" ~ "NT",
      "Australian Capital Territory" ~ "ACT"
    ),
    
    AGE = as.character(AGE),
    
    AGEP = case_when(
      AGE %in% as.character(0:24) ~ AGE,
      AGE %in% as.character(25:29) ~ "25-29",
      AGE %in% as.character(30:34) ~ "30-34",
      AGE %in% as.character(35:39) ~ "35-39",
      AGE %in% as.character(40:44) ~ "40-44",
      AGE %in% as.character(45:49) ~ "45-49",
      AGE %in% as.character(50:54) ~ "50-54",
      AGE %in% as.character(55:59) ~ "55-59",
      AGE %in% as.character(60:64) ~ "60-64",
      AGE %in% as.character(65:69) ~ "65-69",
      AGE %in% as.character(70:74) ~ "70-74",
      AGE %in% as.character(75:79) ~ "75-79",
      AGE %in% as.character(80:84) ~ "80-84",
      AGE %in% as.character(85:107) ~ "85>",
      AGE == "Item Skipped" ~ NA_character_
    ), 
    
      AGEP = factor(
      AGEP,
      levels = c(
        as.character(
          0:24),
          "25-29",
          "30-34",
          "35-39",
          "40-44",
          "45-49",
          "50-54",
          "55-59",
          "60-64",
          "65-69",
          "70-74",
          "75-79",
          "80-84",
          "85>"),
      ordered = TRUE),
    
    CITP = case_match(
      CITIZEN,
      "Yes" ~ "Australian",
      "No" ~ "Not Australian"
      ),
    
    UNI = case_match(
      G3,
      "Postgraduate Degree or Postgraduate Diploma" ~ "Yes",
      "Bachelor Degree (including Honours)" ~ "Yes",
      .default = "No"
    ),
    
    enrolled = if_else(
      J9 == "I don't know whether I am enrolled",
      NA,
      J9
    )
        
  )

3.5 Missing data codes

Recode Item skipped and Unknown to NA.

Code
missingRecode <- function(x) {
  str <- tolower(as.character(x))
  z <- if_else(
    grepl(
      pattern = "(item skipped)|(unknown)",
      str),
    NA,
    x)
  return(z)
}

d2022 <- d2022 %>% 
  mutate(
    across(
      where(is.factor) | where(is.character), 
      ~missingRecode(.x)
      )
    )

3.6 Geo-coding

Several tasks here:

  • Validate match AES postcodes.
  • Lookup SA4s of AES postcodes.
  • Merge Census POA (postcode) variables and principal components onto AES.

3.6.1 Are there unmatched PCODE (postcodes) in the AES data?

Code
load(here("data/2022/POA/poa_prcomp.RData"))
kable(
  d2022 %>% 
  anti_join(poa_prcomp %>%
              mutate(PCODE = str_remove(pattern = "^POA",POA_CODE_2021)),
            by = "PCODE") %>%
  select(PCODE) %>%
  distinct()
) %>% 
  kable_styling(font_size = 9, full_width = TRUE)
PCODE
0871
1640
8002
6872
3724
1355
0861
2001
6956

We’ll manually hard code these:

Code
postcode_remap <- tribble(
  ~PCODE_ORIG, ~PCODE,
  "0871", "0870",
  "1640", "2086",
  "8002", "3002",
  "6872", "6005",
  "3724", "3722",
  "1355", "2022",
  "0861", "0860",
  "2001", "2000",
  "6956", "6156"
)

d2022 <- d2022 %>% 
  rename(PCODE_ORIG = PCODE) %>% 
  left_join(postcode_remap, by = "PCODE_ORIG") %>% 
  mutate(PCODE = if_else(is.na(PCODE),PCODE_ORIG,PCODE))

3.6.2 Add SA4 agglomerations AREAENUM geocodes to match CURF

Code
d2022 <- d2022 %>% 
  left_join(
        sa_to_postcodes %>%
          select(PCODE = POA_NAME_2021,AREAENUM),
        by = "PCODE"
        ) %>% 
   mutate(
    AREAENUM = factor(
      AREAENUM,
      levels = areaenum_info %>% pull(AREAENUM),
      labels = areaenum_info %>% pull(description),
      ordered = TRUE
      )
    )

3.6.3 POA covariates

We add POA-level Census variables, or rather, the 1st 10 principal components extracted from thousands of POA level Census variables provided by ABS in POA General Community Profiles (GCP); see the analysis code/poa_features.R for details.

We also add various rates and medians provided in Table G02 of the ABS POA GCP files.

Code
load(here("data/2022/POA/poa_prcomp.RData"))
d2022 <- d2022 %>% 
  left_join(poa_prcomp %>%
              mutate(PCODE = str_remove(POA_CODE_2021,"POA")),
            by = "PCODE")

3.7 Save the copy of d2022 we have at this stage

Code
save(d2022,file=here("data/2022/aes/d2022_recoded.RData"))

3.8 Searchable listing of variables in the AES:

Code
ojs_define(d2022_meta_raw = d2022_meta)
ojs_define(levs_long_raw = levs_long)
Code
d2022_meta = transpose(d2022_meta_raw)
viewof d2022_search = Inputs.search(d2022_meta)
Inputs.table(d2022_search,
  {
    header: {
      var: "variable"
    },
    rows: 10
  }
)

3.9 AES variables with CURF analogues

We revisit this list as we re-code AES vars to match CURF coding. The general idea will be to create a variable in the AES data with the same name and coding scheme as its CURF analogue.

Code
aes_var_list <- tribble(
  ~name,              ~aes_var,          ~curf_var,   
  "state",            "STATEUR",         "STATEUR",   
  "age",              "AGEP",            "AGEP",
  "ancestry",         "ANCP_collapse",   "ANCP_collapse",
  "citizen",          "CITP",            "CITP",
  "country_of_birth", "born_country_three",          "born_country_three",
  "gender",           "SEXP",            "SEXP",
  "university",       "UNI",             "UNI",
  "indigenous",       "INGP_binary",     "INGP_binary",
  "religion",         "RELP_collapsed",  "RELP_collapsed",
  "marital_status",   "MSTP",            "MSTP",
  "income",           "INCP",            "INCP",
  "occupation",       "G5Code",          "OCCP",
  "SA4 grouping",     "AREAENUM",        "AREAENUM"
)

ojs_define(aes_var_list_raw = aes_var_list %>% as.data.frame())
Code
ojs_define(d2022_subset_raw =
             d2022 %>%
             select(aes_var_list %>% pull(aes_var)
                    ) %>% 
             as.data.frame()
           ) 
ojs_define(d2022_subset_recoded_raw =
             d2022 %>%
             select(any_of(aes_var_list %>% pull(curf_var))) %>%
             as.data.frame()
           ) 
Code
aes_var_list = transpose(aes_var_list_raw)
d2022_subset = transpose(d2022_subset_raw)
d2022_subset_recoded = transpose(d2022_subset_recoded_raw)
levs_long = transpose(levs_long_raw)

viewof aes_curf_join_selection = Inputs.select(
  aes_var_list,
  {
    value: d => d.name,
    format: d => d.name,
    label: "Select variable:"
    }
)

AES marginal:

Code
aes_curf_comp = aes_var_list
  .filter(d => d.name == aes_curf_join_selection.name)

theAESVar = aes_curf_comp.map(d => d.aes_var).toString()
theCURFVar = aes_curf_comp.map(d => d.curf_var).toString()

aes_freq = aq.from(d2022_subset)
  .groupby(theAESVar)
  .count()
  .rename(aq.names(["label","n"]))
  .derive({var: aq.escape(theAESVar)})
  .derive({p: d => d.n/op.sum(d.n)})
  .join_left(aq.from(levs_long))
  .orderby("value")
  .select("var","label","n","p")

// AES tab
Inputs.table(aes_freq,
  {
    header: {
      p: "%"
    },
    format: {
      p: x => d3.format(".1%")(x)
    },
    rows: aes_freq.numRows(),
    maxHeight: (aes_freq.numRows()+1.5)*28
  }
)

CURF marginals:

Code
tab2 = curf_marginals_aq
  .join_left(curf_meta_aq)
  .filter(aq.escape(d => d.var == theCURFVar))
  .derive({var: aq.escape(theCURFVar)})
  .rename(aq.names(["var","label"]))
  .select(["var","label","n","p"])

  
// CURF tab
Inputs.table(tab2,
  {
    header: {
      p: "%"
    },
    format: {
      p: x => d3.format(".1%")(x)
    },
    rows: tab2.numRows(),
    maxHeight: (tab2.numRows()+1.5)*28
  }
)

4 Check alignment between AES and CURF

We now examine the alignment between AES and CURF categories. We restrict this analysis of the CURF to records from adult Australian residents. Likewise we discard observations from

Code
load(here("data/2022/aes/d2022_recoded.RData"))
theVars <- intersect(
  colnames(d2022),
  colnames(curf)
  )
theVars <- theVars[theVars != "STATEUR"]
theVars <- c(theVars,"hvote_collapsed","enrolled")
save(theVars,file=here("data/2022/curf/theVars.RData"))
combined <- 
  bind_rows(
    d2022 %>%
      select(all_of(theVars),
             hvote_collapsed) %>% 
      filter(CITP != "Not Australian") %>% 
      mutate(source = "AES"),
    curf %>% 
      filter(!(AGEP %in% as.character(0:17))) %>% 
      filter(CITP != "Overseas visitor") %>% 
      filter(CITP != "Not Australian") %>% 
      mutate(source = "CURF")
  )

4.1 Drop unused levels on INCP and AGEP

Code
combined <- combined %>% 
  mutate(INCP = fct_drop(INCP),
         AGEP = fct_drop(AGEP))

4.2 Make AGEP continuous for imputations

Imputing continuous quantity easier than discrete. We will convert back to discrete after imputations:

Code
combined <- combined %>% 
  mutate(
    AGE_continuous = case_when(
      AGEP == "18" ~ 18,
      AGEP == "19" ~ 19,
      AGEP == "20" ~ 20,
      AGEP == "21" ~ 21,
      AGEP == "22" ~ 22,
      AGEP == "23" ~ 23,
      AGEP == "24" ~ 24,
      AGEP == "25-29" ~ 27,
      AGEP == "30-34" ~ 32,
      AGEP == "35-39" ~ 37,
      AGEP == "40-44" ~ 42,
      AGEP == "45-49" ~ 47,
      AGEP == "50-54" ~ 52,
      AGEP == "55-59" ~ 57,
      AGEP == "60-64" ~ 62,
      AGEP == "65-69" ~ 67,
      AGEP == "70-74" ~ 72,
      AGEP == "75-79" ~ 77,
      AGEP == "80-84" ~ 82,
      AGEP == "85>" ~ 87
      )
  )

4.3 Add AREAENUM level covariates

Code
combined <- left_join(
  combined,
  left_join(
    areaenum_covars,
    areaenum_info,
    by = "AREAENUM") 
  %>% 
    mutate(AREAENUM = description) %>% 
    select(-description),
  by = "AREAENUM"
)

4.4 Write to file

Code
write_fst(combined,
          path = here("data/2022/curf/aes_combined.fst"))

4.5 Compute summary stats for inspection

Code
combined_marginals <- combined %>% 
  select(-starts_with("PC"),-ends_with("_G02"),-AGE_continuous) %>% 
  pivot_longer(cols = !(source),
               names_to = "var",
               values_to = "value") %>% 
  count(var,source,value) %>%
  group_by(var,source) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup() %>% 
  pivot_wider(id_cols = c("var","value"),
              names_from = "source",
              values_from = c("n","p"))

levelSets <- list()
for (v in theVars) {
  thisVar <- d2022 %>% pull({{v}})
  levs <- if(is.factor(thisVar)){
    levels(thisVar)
  } else {
    sort(unique(thisVar))
  }
  
  levelSets[[v]] <- tibble(value = levs) %>% mutate(indx = 1:n())

  }

levelSets <- bind_rows(levelSets,.id = "var")

combined_marginals <- left_join(combined_marginals,
                                levelSets,
                                by = c("var","value")) %>%
  arrange(var,indx)
Code
ojs_define(combined_marginals_raw = combined_marginals)
ojs_define(theVars_raw = theVars)
Code
combined_marginals = transpose(combined_marginals_raw)

viewof combined_selection = Inputs.select(
  theVars_raw,
  {
    label: "Select variable:"
  }
)

combined_tab = aq.from(combined_marginals)
  .filter(aq.escape(d => d.var == combined_selection))

Inputs.table(
  combined_tab,
  {
    columns: [ "value", "n_AES", "p_AES", "n_CURF", "p_CURF" ],
    header: {
      n_AES: "n AES",
      n_CURF: "n CURF",
      p_AES: "AES %",
      p_CURF: "CURF %"
    },
    format: {
      p_AES: x => d3.format(".1%")(x),
      p_CURF: x => d3.format(".1%")(x)
    },
    sort: "indx",
    rows: combined_tab.numRows(),
    maxHeight: (combined_tab.numRows()+1.5)*28
  }
)

5 Joint imputation of missing data

Done in a batch job, aes_curf_impute.R.

Call mice for imputing missing:

Code
combined <- read_fst(here("data/2022/curf/aes_combined.fst"))

library(mice)
combined_imp <- futuremice(
  combined %>% 
    select(starts_with("PC"),
           ends_with("G02"),
           any_of(theVars),
           "AGE_continuous",
           "hvote_collapsed",
           "enrolled") %>% 
    select(-AREAENUM,-BPLP),
  defaultMethod = c("cart","cart","cart","polr"),
  parallelseed = 314159L,
  m = 8,
  print = TRUE,
  n.core = 8,
  n.imp.core = 1,
  maxit = 25
)

combined_imputed <- complete(combined_imp,action = "long")
write_fst(combined_imputed,here("data/2022/curf/aes_combined_imputed.fst"))

## take most frequently imputed value for missing values of discrete vars
## average for missing values of continuous

# Define the Mode function
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

d2022_imputed_final <- d2022_imputed %>% 
  group_by(.id) %>% 
  summarise(across(where(is.character) | where(is.factor),
                   ~Mode(.x)),
            across(where(is.numeric),
                   ~median(.x))) %>%
  ungroup()

save(d2022_imputed_final,file=here("data/2022/aes/d2022_imputed_final.RData"))

We load the output of the imputations, if available